import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette('muted')[0:5]
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn.metrics as metrics
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.decomposition import FactorAnalysis
# Load the IBM HR Employee Attrition dataset from the working directory.
csv_path = "./WA_Fn-UseC_-HR-Employee-Attrition.csv"
data = pd.read_csv(csv_path)
# Preview the first five rows to sanity-check the load.
data.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
# List all 35 column names to orient the analysis.
data.columns
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
# Dataset dimensions: (rows, columns).
data.shape
(1470, 35)
# `data.info` without parentheses only echoes the bound method; call it to
# actually print dtypes and non-null counts.
data.info()
<bound method DataFrame.info of Age Attrition BusinessTravel DailyRate Department \
0 41 Yes Travel_Rarely 1102 Sales
1 49 No Travel_Frequently 279 Research & Development
2 37 Yes Travel_Rarely 1373 Research & Development
3 33 No Travel_Frequently 1392 Research & Development
4 27 No Travel_Rarely 591 Research & Development
... ... ... ... ... ...
1465 36 No Travel_Frequently 884 Research & Development
1466 39 No Travel_Rarely 613 Research & Development
1467 27 No Travel_Rarely 155 Research & Development
1468 49 No Travel_Frequently 1023 Sales
1469 34 No Travel_Rarely 628 Research & Development
DistanceFromHome Education EducationField EmployeeCount \
0 1 2 Life Sciences 1
1 8 1 Life Sciences 1
2 2 2 Other 1
3 3 4 Life Sciences 1
4 2 1 Medical 1
... ... ... ... ...
1465 23 2 Medical 1
1466 6 1 Medical 1
1467 4 3 Life Sciences 1
1468 2 3 Medical 1
1469 8 3 Medical 1
EmployeeNumber ... RelationshipSatisfaction StandardHours \
0 1 ... 1 80
1 2 ... 4 80
2 4 ... 2 80
3 5 ... 3 80
4 7 ... 4 80
... ... ... ... ...
1465 2061 ... 3 80
1466 2062 ... 1 80
1467 2064 ... 2 80
1468 2065 ... 4 80
1469 2068 ... 1 80
StockOptionLevel TotalWorkingYears TrainingTimesLastYear \
0 0 8 0
1 1 10 3
2 0 7 3
3 0 8 3
4 1 6 3
... ... ... ...
1465 1 17 3
1466 1 9 5
1467 1 6 0
1468 0 17 3
1469 0 6 3
WorkLifeBalance YearsAtCompany YearsInCurrentRole \
0 1 6 4
1 3 10 7
2 3 0 0
3 3 8 7
4 3 2 2
... ... ... ...
1465 3 5 2
1466 3 7 7
1467 3 6 2
1468 2 9 6
1469 4 4 3
YearsSinceLastPromotion YearsWithCurrManager
0 0 5
1 1 7
2 0 0
3 3 0
4 2 2
... ... ...
1465 0 3
1466 1 7
1467 0 3
1468 0 8
1469 1 2
[1470 rows x 35 columns]>
# Summary statistics for the numeric columns. Note EmployeeCount and
# StandardHours have zero variance (std = 0) — candidates for dropping.
data.describe()
| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 26 columns
# Check for missing values per column. The original used `.all()`, which
# only flags columns that are ENTIRELY NaN; `.any()` flags columns with at
# least one missing value, which is the intended missing-data check.
data.isna().any()
Age False Attrition False BusinessTravel False DailyRate False Department False DistanceFromHome False Education False EducationField False EmployeeCount False EmployeeNumber False EnvironmentSatisfaction False Gender False HourlyRate False JobInvolvement False JobLevel False JobRole False JobSatisfaction False MaritalStatus False MonthlyIncome False MonthlyRate False NumCompaniesWorked False Over18 False OverTime False PercentSalaryHike False PerformanceRating False RelationshipSatisfaction False StandardHours False StockOptionLevel False TotalWorkingYears False TrainingTimesLastYear False WorkLifeBalance False YearsAtCompany False YearsInCurrentRole False YearsSinceLastPromotion False YearsWithCurrManager False dtype: bool
# Keep only the numeric columns for the heatmap and correlation work.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numericData = data.select_dtypes(include=numeric_dtypes)
# Heatmap of the raw numeric values (one row per employee).
fig, ax = plt.subplots(figsize=(13, 8))
sns.heatmap(numericData, ax=ax)
plt.show()
# Pairwise Pearson correlations. The constant columns (EmployeeCount,
# StandardHours) have zero variance, so their correlations are NaN.
numericData.corr()
| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | 0.010661 | -0.001686 | 0.208034 | NaN | -0.010145 | 0.010146 | 0.024287 | 0.029820 | 0.509604 | ... | 0.053535 | NaN | 0.037510 | 0.680381 | -0.019621 | -0.021490 | 0.311309 | 0.212901 | 0.216513 | 0.202089 |
| DailyRate | 0.010661 | 1.000000 | -0.004985 | -0.016806 | NaN | -0.050990 | 0.018355 | 0.023381 | 0.046135 | 0.002966 | ... | 0.007846 | NaN | 0.042143 | 0.014515 | 0.002453 | -0.037848 | -0.034055 | 0.009932 | -0.033229 | -0.026363 |
| DistanceFromHome | -0.001686 | -0.004985 | 1.000000 | 0.021042 | NaN | 0.032916 | -0.016075 | 0.031131 | 0.008783 | 0.005303 | ... | 0.006557 | NaN | 0.044872 | 0.004628 | -0.036942 | -0.026556 | 0.009508 | 0.018845 | 0.010029 | 0.014406 |
| Education | 0.208034 | -0.016806 | 0.021042 | 1.000000 | NaN | 0.042070 | -0.027128 | 0.016775 | 0.042438 | 0.101589 | ... | -0.009118 | NaN | 0.018422 | 0.148280 | -0.025100 | 0.009819 | 0.069114 | 0.060236 | 0.054254 | 0.069065 |
| EmployeeCount | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EmployeeNumber | -0.010145 | -0.050990 | 0.032916 | 0.042070 | NaN | 1.000000 | 0.017621 | 0.035179 | -0.006888 | -0.018519 | ... | -0.069861 | NaN | 0.062227 | -0.014365 | 0.023603 | 0.010309 | -0.011240 | -0.008416 | -0.009019 | -0.009197 |
| EnvironmentSatisfaction | 0.010146 | 0.018355 | -0.016075 | -0.027128 | NaN | 0.017621 | 1.000000 | -0.049857 | -0.008278 | 0.001212 | ... | 0.007665 | NaN | 0.003432 | -0.002693 | -0.019359 | 0.027627 | 0.001458 | 0.018007 | 0.016194 | -0.004999 |
| HourlyRate | 0.024287 | 0.023381 | 0.031131 | 0.016775 | NaN | 0.035179 | -0.049857 | 1.000000 | 0.042861 | -0.027853 | ... | 0.001330 | NaN | 0.050263 | -0.002334 | -0.008548 | -0.004607 | -0.019582 | -0.024106 | -0.026716 | -0.020123 |
| JobInvolvement | 0.029820 | 0.046135 | 0.008783 | 0.042438 | NaN | -0.006888 | -0.008278 | 0.042861 | 1.000000 | -0.012630 | ... | 0.034297 | NaN | 0.021523 | -0.005533 | -0.015338 | -0.014617 | -0.021355 | 0.008717 | -0.024184 | 0.025976 |
| JobLevel | 0.509604 | 0.002966 | 0.005303 | 0.101589 | NaN | -0.018519 | 0.001212 | -0.027853 | -0.012630 | 1.000000 | ... | 0.021642 | NaN | 0.013984 | 0.782208 | -0.018191 | 0.037818 | 0.534739 | 0.389447 | 0.353885 | 0.375281 |
| JobSatisfaction | -0.004892 | 0.030571 | -0.003669 | -0.011296 | NaN | -0.046247 | -0.006784 | -0.071335 | -0.021476 | -0.001944 | ... | -0.012454 | NaN | 0.010690 | -0.020185 | -0.005779 | -0.019459 | -0.003803 | -0.002305 | -0.018214 | -0.027656 |
| MonthlyIncome | 0.497855 | 0.007707 | -0.017014 | 0.094961 | NaN | -0.014829 | -0.006259 | -0.015794 | -0.015271 | 0.950300 | ... | 0.025873 | NaN | 0.005408 | 0.772893 | -0.021736 | 0.030683 | 0.514285 | 0.363818 | 0.344978 | 0.344079 |
| MonthlyRate | 0.028051 | -0.032182 | 0.027473 | -0.026084 | NaN | 0.012648 | 0.037600 | -0.015297 | -0.016322 | 0.039563 | ... | -0.004085 | NaN | -0.034323 | 0.026442 | 0.001467 | 0.007963 | -0.023655 | -0.012815 | 0.001567 | -0.036746 |
| NumCompaniesWorked | 0.299635 | 0.038153 | -0.029251 | 0.126317 | NaN | -0.001251 | 0.012594 | 0.022157 | 0.015012 | 0.142501 | ... | 0.052733 | NaN | 0.030075 | 0.237639 | -0.066054 | -0.008366 | -0.118421 | -0.090754 | -0.036814 | -0.110319 |
| PercentSalaryHike | 0.003634 | 0.022704 | 0.040235 | -0.011111 | NaN | -0.012944 | -0.031701 | -0.009062 | -0.017205 | -0.034730 | ... | -0.040490 | NaN | 0.007528 | -0.020608 | -0.005221 | -0.003280 | -0.035991 | -0.001520 | -0.022154 | -0.011985 |
| PerformanceRating | 0.001904 | 0.000473 | 0.027110 | -0.024539 | NaN | -0.020359 | -0.029548 | -0.002172 | -0.029071 | -0.021222 | ... | -0.031351 | NaN | 0.003506 | 0.006744 | -0.015579 | 0.002572 | 0.003435 | 0.034986 | 0.017896 | 0.022827 |
| RelationshipSatisfaction | 0.053535 | 0.007846 | 0.006557 | -0.009118 | NaN | -0.069861 | 0.007665 | 0.001330 | 0.034297 | 0.021642 | ... | 1.000000 | NaN | -0.045952 | 0.024054 | 0.002497 | 0.019604 | 0.019367 | -0.015123 | 0.033493 | -0.000867 |
| StandardHours | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| StockOptionLevel | 0.037510 | 0.042143 | 0.044872 | 0.018422 | NaN | 0.062227 | 0.003432 | 0.050263 | 0.021523 | 0.013984 | ... | -0.045952 | NaN | 1.000000 | 0.010136 | 0.011274 | 0.004129 | 0.015058 | 0.050818 | 0.014352 | 0.024698 |
| TotalWorkingYears | 0.680381 | 0.014515 | 0.004628 | 0.148280 | NaN | -0.014365 | -0.002693 | -0.002334 | -0.005533 | 0.782208 | ... | 0.024054 | NaN | 0.010136 | 1.000000 | -0.035662 | 0.001008 | 0.628133 | 0.460365 | 0.404858 | 0.459188 |
| TrainingTimesLastYear | -0.019621 | 0.002453 | -0.036942 | -0.025100 | NaN | 0.023603 | -0.019359 | -0.008548 | -0.015338 | -0.018191 | ... | 0.002497 | NaN | 0.011274 | -0.035662 | 1.000000 | 0.028072 | 0.003569 | -0.005738 | -0.002067 | -0.004096 |
| WorkLifeBalance | -0.021490 | -0.037848 | -0.026556 | 0.009819 | NaN | 0.010309 | 0.027627 | -0.004607 | -0.014617 | 0.037818 | ... | 0.019604 | NaN | 0.004129 | 0.001008 | 0.028072 | 1.000000 | 0.012089 | 0.049856 | 0.008941 | 0.002759 |
| YearsAtCompany | 0.311309 | -0.034055 | 0.009508 | 0.069114 | NaN | -0.011240 | 0.001458 | -0.019582 | -0.021355 | 0.534739 | ... | 0.019367 | NaN | 0.015058 | 0.628133 | 0.003569 | 0.012089 | 1.000000 | 0.758754 | 0.618409 | 0.769212 |
| YearsInCurrentRole | 0.212901 | 0.009932 | 0.018845 | 0.060236 | NaN | -0.008416 | 0.018007 | -0.024106 | 0.008717 | 0.389447 | ... | -0.015123 | NaN | 0.050818 | 0.460365 | -0.005738 | 0.049856 | 0.758754 | 1.000000 | 0.548056 | 0.714365 |
| YearsSinceLastPromotion | 0.216513 | -0.033229 | 0.010029 | 0.054254 | NaN | -0.009019 | 0.016194 | -0.026716 | -0.024184 | 0.353885 | ... | 0.033493 | NaN | 0.014352 | 0.404858 | -0.002067 | 0.008941 | 0.618409 | 0.548056 | 1.000000 | 0.510224 |
| YearsWithCurrManager | 0.202089 | -0.026363 | 0.014406 | 0.069065 | NaN | -0.009197 | -0.004999 | -0.020123 | 0.025976 | 0.375281 | ... | -0.000867 | NaN | 0.024698 | 0.459188 | -0.004096 | 0.002759 | 0.769212 | 0.714365 | 0.510224 | 1.000000 |
26 rows × 26 columns
Let's now visualize the distribution of employees who left the company vs. those who stayed.
# Pie chart of the Attrition class balance.
# NOTE(review): `labels` is set to the class COUNTS, so the raw counts are
# drawn next to the slices while the legend carries the class names
# ('No'/'Yes') — presumably intentional, but worth confirming.
plt.title("Distribution of Attrition in the dataset")
plt.pie(data.Attrition.value_counts(),
labels = data.Attrition.value_counts(),
colors = color)
plt.legend(data.Attrition.value_counts().index)
<matplotlib.legend.Legend at 0x26e69406610>
# Pairwise scatter matrix of every numeric feature (26x26 panels — slow).
sns.pairplot(numericData)
<seaborn.axisgrid.PairGrid at 0x1d1d42ba1d0>
# Subset: employees who left the company (Attrition == 'Yes').
attritionYes = data[data.Attrition == 'Yes']
plt.title("Distribution of employees' age who were attritioned")
sns.histplot(attritionYes.Age, color = 'skyblue')
<AxesSubplot: title={'center': "Distribution of employees' age who were attritioned"}, xlabel='Age', ylabel='Count'>
# Monthly income distribution among leavers.
plt.title("Distribution of employees' Monthly Income who were attritioned")
sns.histplot(attritionYes.MonthlyIncome, color = 'lightgreen')
<AxesSubplot: title={'center': "Distribution of employees' Monthly Income who were attritioned"}, xlabel='MonthlyIncome', ylabel='Count'>
# Salary-hike percentage distribution among leavers.
plt.title("Distribution of employees' Percent of salary hike who were attritioned")
sns.histplot(attritionYes.PercentSalaryHike, color = 'orange')
<AxesSubplot: title={'center': "Distribution of employees' Percent of salary hike who were attritioned"}, xlabel='PercentSalaryHike', ylabel='Count'>
# Tenure (years at company) distribution among leavers.
plt.title("Distribution of employees' years at the company before they were attritioned")
sns.histplot(attritionYes.YearsAtCompany, color = 'purple')
<AxesSubplot: title={'center': "Distribution of employees' years at the company before they were attritioned"}, xlabel='YearsAtCompany', ylabel='Count'>
# Income vs. age, colored by attrition status, for the full dataset.
plt.title("Scatter plot of employees' age v/s Monthly income")
sns.scatterplot(x = data.MonthlyIncome, y = data.Age, hue = data.Attrition)
<AxesSubplot: title={'center': "Scatter plot of employees' age v/s Monthly income"}, xlabel='MonthlyIncome', ylabel='Age'>
# Hand-picked subset of features for a focused pairwise look.
usefulData = data[['YearsAtCompany', 'PercentSalaryHike', 'MonthlyIncome', 'Age' ,'NumCompaniesWorked']]
sns.pairplot(usefulData)
<seaborn.axisgrid.PairGrid at 0x1d1fb8f7890>
# Correlations among the selected features only.
usefulData.corr()
| YearsAtCompany | PercentSalaryHike | MonthlyIncome | Age | NumCompaniesWorked | |
|---|---|---|---|---|---|
| YearsAtCompany | 1.000000 | -0.035991 | 0.514285 | 0.311309 | -0.118421 |
| PercentSalaryHike | -0.035991 | 1.000000 | -0.027269 | 0.003634 | -0.010238 |
| MonthlyIncome | 0.514285 | -0.027269 | 1.000000 | 0.497855 | 0.149515 |
| Age | 0.311309 | 0.003634 | 0.497855 | 1.000000 | 0.299635 |
| NumCompaniesWorked | -0.118421 | -0.010238 | 0.149515 | 0.299635 | 1.000000 |
# Full schema: dtypes and non-null counts (no missing values anywhere).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
# Drop the zero-variance columns (EmployeeCount, StandardHours, Over18) and
# the EmployeeNumber identifier — none carry predictive information.
useless_cols = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
data.drop(useless_cols, axis="columns", inplace=True)
# Confirm four columns were removed (35 -> 31).
data.shape
(1470, 31)
# Object-dtype columns with limited cardinality (<= 50 distinct values):
# these are the categorical features that need encoding.
categorical_col = [
    column
    for column in data.columns
    if data[column].dtype == object and len(data[column].unique()) <= 50
]
categorical_col
['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
# Binary-encode the target: Attrition 'No' -> 0, 'Yes' -> 1 (category codes).
data['Attrition'] = data.Attrition.astype("category").cat.codes
# The target is now numeric; only the remaining predictors need encoding.
categorical_col.remove('Attrition')
# categorical_col.remove('BusinessTravel')
# Integer-encode each categorical predictor in place.
label = LabelEncoder()
for column in categorical_col:
data[column] = label.fit_transform(data[column])
data.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 2 | 0 | ... | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 4 | 1 | ... | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 4 | 0 | ... | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 1 | ... | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 31 columns
# Features / target split.
X = data.drop(['Attrition'], axis = 1)
y = data['Attrition']
# Hold out 20% for testing. stratify=y keeps the class ratio identical in
# both splits (important for the imbalanced Attrition target), and a fixed
# random_state makes the split — and every metric below — reproducible.
# The original call had neither.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
y_train
995 0
727 0
68 0
1420 0
692 0
..
1084 0
582 0
1356 0
1438 1
6 0
Name: Attrition, Length: 1176, dtype: int8
# Re-plot the class balance after encoding: roughly 5:1 (stayed : left),
# which motivates the resampling below.
plt.title("Distribution of Attrition in the dataset")
plt.pie(data.Attrition.value_counts(),
labels = data.Attrition.value_counts(),
colors = color)
plt.legend(data.Attrition.value_counts().index)
<matplotlib.legend.Legend at 0x26e6ba61590>
We already know that the dataset is imbalanced, so let's apply the SMOTE technique to make it balanced.
# Balance the training classes with SMOTE (synthetic minority oversampling).
smote = SMOTE(sampling_strategy='auto', random_state=42)
# One-hot encoding: fit on the TRAINING data only, then apply the same
# fitted encoder to the test data. The original code called fit_transform
# on the test set too, which refits the encoder on test data — that leaks
# test information and can yield a column layout that doesn't match the
# training matrix.
encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)
# SMOTE interpolates numeric features, so restrict it to numeric columns.
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])
X_train_categorical = X_train.select_dtypes(include=['object'])
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_numeric, y_train)
# Sparse one-hot matrix for the test split (not used by the models below).
X_test_encoded
<294x1198 sparse matrix of type '<class 'numpy.float64'>' with 8820 stored elements in Compressed Sparse Row format>
# Baseline model: logistic regression trained on the SMOTE-balanced
# numeric training data.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model.fit(X_train_resampled, y_train_resampled)
# Restrict the test set to the same (numeric-only) columns used in training.
X_test = X_test[X_train_resampled.columns]
y_pred = logistic_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.7482993197278912
Confusion Matrix:
[[197 54]
[ 20 23]]
Classification Report:
precision recall f1-score support
0 0.91 0.78 0.84 251
1 0.30 0.53 0.38 43
accuracy 0.75 294
macro avg 0.60 0.66 0.61 294
weighted avg 0.82 0.75 0.77 294
# ROC/AUC should be computed from class-1 PROBABILITIES, not hard 0/1
# predictions: with hard labels the "curve" collapses to a single point
# and the AUC is understated. The original passed y_pred here.
y_score = logistic_model.predict_proba(X_test)[:, 1]
logistic_auc = roc_auc_score(y_test, y_score)
# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Logistic Regression (AUC = %0.2f)' % logistic_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
from sklearn.model_selection import train_test_split, GridSearchCV
# Hyper-parameter grid for the random forest (3 x 3 x 3 = 27 combinations).
param_grid_rf = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5, 10]
}
# 5-fold cross-validated grid search on the SMOTE-balanced training data,
# using all CPU cores (n_jobs=-1).
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train_resampled, y_train_resampled)
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
param_grid={'max_depth': [None, 10, 20],
'min_samples_split': [2, 5, 10],
'n_estimators': [100, 200, 300]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
param_grid={'max_depth': [None, 10, 20],
'min_samples_split': [2, 5, 10],
'n_estimators': [100, 200, 300]})RandomForestClassifier(random_state=42)
RandomForestClassifier(random_state=42)
# Evaluate the best grid-search estimator on the held-out test set.
# (X_test was already restricted to the numeric training columns above.)
# X_test = X_test[X_train_resampled.columns]
y_pred = grid_search_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.8299319727891157
Confusion Matrix:
[[231 20]
[ 30 13]]
Classification Report:
precision recall f1-score support
0 0.89 0.92 0.90 251
1 0.39 0.30 0.34 43
accuracy 0.83 294
macro avg 0.64 0.61 0.62 294
weighted avg 0.81 0.83 0.82 294
# Use positive-class probabilities for ROC/AUC (hard labels collapse the
# curve to one point and understate the AUC — the original passed y_pred).
y_score = grid_search_rf.predict_proba(X_test)[:, 1]
random_forest_auc = roc_auc_score(y_test, y_score)
# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Grid Search-Random Forest Classifier (AUC = %0.2f)' % random_forest_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
# Gradient-boosted trees (XGBoost) on the same balanced training data.
xgb = XGBClassifier(random_state=7)
xgb.fit(X_train_resampled, y_train_resampled)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=7, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=7, ...)
y_pred = xgb.predict(X_test)
# Test-set metrics for the XGBoost model.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.8197278911564626
Confusion Matrix:
[[225 26]
[ 27 16]]
Classification Report:
precision recall f1-score support
0 0.89 0.90 0.89 251
1 0.38 0.37 0.38 43
accuracy 0.82 294
macro avg 0.64 0.63 0.64 294
weighted avg 0.82 0.82 0.82 294
# ROC/AUC from predicted probabilities of the positive class, not hard
# labels (the original passed y_pred, understating the AUC).
y_score = xgb.predict_proba(X_test)[:, 1]
random_forest_auc = roc_auc_score(y_test, y_score)
# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='XGBoost Classifier (AUC = %0.2f)' % random_forest_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
Instead of using the SMOTE technique, I will resample the dataset myself.
# Recheck the encoded dataframe before the manual-resampling experiment.
data.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 2 | 0 | ... | 3 | 1 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 4 | 1 | ... | 3 | 2 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 4 | 0 | ... | 3 | 3 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 1 | ... | 3 | 4 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 31 columns
from sklearn.utils import resample

# Baseline: hold out 30% of the original (imbalanced) data and fit a random forest.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model = RandomForestClassifier()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
accuracy = accuracy_score(ytest, ypred)
print(f'Accuracy: {accuracy}')
# Bootstrap-resample the full dataset (class-stratified), then re-split.
# NOTE(review): sampling with replacement BEFORE the split puts duplicate rows
# in both train and test, so the scores below are optimistically biased —
# resample the training split only if an honest estimate is needed.
X, y = resample(X, y, replace=True, stratify=y)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)
# Bug fix: the model must be retrained on the new training split; previously
# it was only ever fit on the pre-resampling data and used stale here.
model.fit(xtrain, ytrain)
y_pred = model.predict(xtest)
print("Accuracy:", accuracy_score(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))
print("Classification Report:\n", classification_report(ytest, y_pred))
Accuracy: 0.9546485260770975
Confusion Matrix:
[[378 2]
[ 18 43]]
Classification Report:
precision recall f1-score support
0 0.95 0.99 0.97 380
1 0.96 0.70 0.81 61
accuracy 0.95 441
macro avg 0.96 0.85 0.89 441
weighted avg 0.95 0.95 0.95 441
# AUC and ROC curve for the random forest on the resampled hold-out split.
new_rf_auc = roc_auc_score(ytest, y_pred)
fpr, tpr, _ = roc_curve(ytest, y_pred)
fig = plt.figure(figsize=(8, 6))
curve_label = 'New Random Forest Classifier (AUC = %0.2f)' % new_rf_auc
plt.plot(fpr, tpr, color='purple', label=curve_label)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
xgb.fit(xtrain, ytrain)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=7, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=7, ...)y_pred = xgb.predict(xtest)
# Evaluate the retrained XGBoost model on the resampled hold-out split.
acc = accuracy_score(ytest, y_pred)
cm = confusion_matrix(ytest, y_pred)
report = classification_report(ytest, y_pred)
print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)
Accuracy: 0.9591836734693877
Confusion Matrix:
[[377 3]
[ 15 46]]
Classification Report:
precision recall f1-score support
0 0.96 0.99 0.98 380
1 0.94 0.75 0.84 61
accuracy 0.96 441
macro avg 0.95 0.87 0.91 441
weighted avg 0.96 0.96 0.96 441
# AUC and ROC curve for the XGBoost model on the resampled data.
new_xgb_auc = roc_auc_score(ytest, y_pred)
# Generate ROC curve (hard 0/1 predictions -> single operating point).
fpr, tpr, _ = roc_curve(ytest, y_pred)
plt.figure(figsize=(8, 6))
# Bug fix: the label previously read "XGBosot" and displayed new_rf_auc
# (the random-forest score) instead of the new_xgb_auc computed above,
# which was otherwise never used.
plt.plot(fpr, tpr, label='New XGBoost Classifier (AUC = %0.2f)' % new_xgb_auc, color='purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
# Fit AdaBoost (50 estimators, learning rate 1) on the resampled split and evaluate.
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)
model = abc.fit(xtrain, ytrain)
y_pred = model.predict(xtest)
acc = accuracy_score(ytest, y_pred)
cm = confusion_matrix(ytest, y_pred)
report = classification_report(ytest, y_pred)
print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)
Accuracy: 0.9229024943310657
Confusion Matrix:
[[372 8]
[ 26 35]]
Classification Report:
precision recall f1-score support
0 0.93 0.98 0.96 380
1 0.81 0.57 0.67 61
accuracy 0.92 441
macro avg 0.87 0.78 0.81 441
weighted avg 0.92 0.92 0.92 441
# AUC and ROC curve for the AdaBoost predictions.
abc_auc = roc_auc_score(ytest, y_pred)
fpr, tpr, _ = roc_curve(ytest, y_pred)
fig = plt.figure(figsize=(8, 6))
curve_label = 'ADA Boost Classifier (AUC = %0.2f)' % abc_auc
plt.plot(fpr, tpr, color='purple', label=curve_label)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
from sklearn.decomposition import FactorAnalysis  # fix: was never imported

# Standardise the features, then fit a 3-factor model.
n_factors = 3  # fix: n_factors was referenced below but never defined
X_std = StandardScaler().fit_transform(X)
# Initialize factor analysis object
fa = FactorAnalysis(n_components=n_factors, random_state=0)
fa.fit(X_std)
# Loadings matrix: one row per feature, one column per factor.
loadings = fa.components_.T
feature_names = X.columns
factor_names = [f"Factor {i+1}" for i in range(n_factors)]
loadings_df = pd.DataFrame(loadings, columns=factor_names, index=feature_names)
# Print the factor loadings
print("Factor Loadings:\n", loadings_df)
Factor Loadings:
Factor 1 Factor 2 Factor 3
Age 0.530641 -0.076421 0.047642
BusinessTravel 0.035199 -0.013354 -0.048325
DailyRate 0.014752 -0.056018 0.043069
Department 0.051011 -0.006220 0.033339
DistanceFromHome -0.015279 0.018852 0.042615
Education 0.096562 0.008010 0.062439
EducationField -0.018389 0.034419 0.003113
EnvironmentSatisfaction 0.001762 0.028274 -0.021741
Gender -0.029964 -0.005723 -0.041460
HourlyRate -0.040842 -0.023192 -0.021789
JobInvolvement 0.005951 0.000549 -0.029541
JobLevel 0.961464 -0.157041 0.003293
JobRole -0.082364 -0.034650 0.028849
JobSatisfaction 0.007491 0.013298 -0.015579
MaritalStatus -0.067451 0.010377 -0.029364
MonthlyIncome 0.958079 -0.187924 0.011524
MonthlyRate 0.038677 -0.049803 -0.003254
NumCompaniesWorked 0.140146 -0.243126 -0.001987
OverTime -0.035255 -0.022218 0.021213
PercentSalaryHike -0.063875 -0.019387 0.912270
PerformanceRating -0.033014 0.025016 0.866429
RelationshipSatisfaction 0.088917 0.045270 -0.061777
StockOptionLevel -0.042982 0.026418 0.035713
TotalWorkingYears 0.829761 0.087802 0.035045
TrainingTimesLastYear 0.029342 0.031088 -0.020535
WorkLifeBalance 0.056407 -0.013753 -0.001397
YearsAtCompany 0.662033 0.675190 -0.013795
YearsInCurrentRole 0.517017 0.638249 0.043816
YearsSinceLastPromotion 0.466992 0.466657 0.001490
YearsWithCurrManager 0.495566 0.660836 0.036522
# For each factor, keep the features whose absolute loading exceeds 0.4.
important_features = {
    factor: loadings_df.index[loadings_df[factor].abs() > 0.4].tolist()
    for factor in loadings_df.columns
}
# Printing the important features for each factor
print("Important Features:\n", important_features)
Important Features:
{'Factor 1': ['Age', 'JobLevel', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], 'Factor 2': ['YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], 'Factor 3': ['PercentSalaryHike', 'PerformanceRating']}
# Visualise all factor loadings at once, with the colour scale centred at zero.
fig = plt.figure(figsize=(12, 10))
ax = sns.heatmap(loadings_df, cmap="Blues", annot=True, cbar=True, center=0)
plt.title('Factor Loadings Heatmap')
plt.xlabel('Factors')
plt.ylabel('Features')
plt.show()
# Flatten the per-factor lists into one de-duplicated feature list.
important_features_list = list({
    feature
    for features in important_features.values()
    for feature in features
})
# Select only the important features from the original feature DataFrame 'X'.
X_important = X[important_features_list]
Let's split the data using just the important features.
# 70/30 split on the reduced feature set; fixed seeds for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_important, y, test_size=0.3, random_state=0
)
# Initialize and train the prediction algorithm.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
RandomForestClassifier(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=0)
# Predict on the important-feature test split and report the usual metrics.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)
Accuracy: 0.9047619047619048
Confusion Matrix:
[[358 13]
[ 29 41]]
Classification Report:
precision recall f1-score support
0 0.93 0.96 0.94 371
1 0.76 0.59 0.66 70
accuracy 0.90 441
macro avg 0.84 0.78 0.80 441
weighted avg 0.90 0.90 0.90 441
# Bug fix: y_pred came from the important-feature split (X_test / y_test), so
# it must be scored against y_test — not the stale ytest from the earlier
# resampling split. Both happen to have 441 rows, which masked the mismatch.
rfc_auc = roc_auc_score(y_test, y_pred)
# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
plt.figure(figsize=(8, 6))
# Bug fix: the label previously displayed abc_auc (the AdaBoost score)
# instead of the rfc_auc computed above.
plt.plot(fpr, tpr, label='Random Forest Classifier (AUC = %0.2f)' % rfc_auc, color='purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of Random Forest Classifier With Important Features')
plt.legend()
plt.show()
from sklearn.cluster import KMeans  # fix: KMeans was never imported

# Standardise the features, project onto 2 principal components, then cluster.
X_std = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_std)
# Let's try to identify 3 clusters for this example.
# n_init=10 pinned explicitly to silence the sklearn FutureWarning seen in the output.
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(principalComponents)
# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Cluster'] = kmeans.labels_
# Plot the clusters
plt.figure(figsize=(8, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Cluster', data=pca_df, palette='viridis')
plt.title('Clusters identified by PCA-reduced data')
plt.show()
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
from sklearn.cluster import KMeans  # fix: KMeans was never imported
from sklearn.metrics import silhouette_score

# We will use the silhouette score to find the optimal number of clusters,
# testing k = 2..6. (Loop indentation restored — it was lost in the export.)
silhouette_scores = []
range_n_clusters = list(range(2, 7))
for n_clusters in range_n_clusters:
    # n_init=10 pinned explicitly to silence the sklearn FutureWarning seen in the output.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(principalComponents)
    cluster_labels = kmeans.labels_
    # Calculate silhouette score and append to list
    silhouette_avg = silhouette_score(principalComponents, cluster_labels)
    silhouette_scores.append(silhouette_avg)
# Plotting these silhouette scores
plt.figure(figsize=(10, 5))
sns.lineplot(x=range_n_clusters, y=silhouette_scores, marker='o')
plt.title('Silhouette Scores for Various Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range_n_clusters)
plt.show()
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
from sklearn.cluster import KMeans  # fix: KMeans was never imported

# Final clustering with k=2 (best silhouette score in the sweep above).
# n_init=10 pinned explicitly to silence the sklearn FutureWarning seen in the output.
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(principalComponents)
# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])
# Add the cluster labels
pca_df['Cluster'] = kmeans.labels_
# Plot the clusters
plt.figure(figsize=(8, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Cluster', data=pca_df, palette='viridis')
plt.title('Clusters identified by PCA-reduced data')
plt.show()
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)